Stabilizing Energy Supplies and Forecasting Future Energy Use

Group 9
2022, April 28

Preliminary setup

Load the required library packages.

Data loading

We collected the past ten years of data from PJM Interconnection LLC, a regional transmission organization that serves thirteen states for electric transmission.

# Build absolute paths to the ten yearly load files instead of calling
# setwd(), which mutates global state and breaks if the script moves.
data_dir <- "/Users/meizhuchen/Desktop/Columbia/Columbia_S2/APAN_5205/Project"
years <- 2012:2021
load_files <- file.path(data_dir, sprintf("hrl_load_estimated_%d.csv", years))

# Read each yearly file once, then expose the frames under the original
# names (data2012 ... data2021) so downstream code keeps working.
yearly_data <- lapply(load_files, read.csv)
names(yearly_data) <- paste0("data", years)
list2env(yearly_data, envir = environment())

Integrate the past ten years data

# Stack the ten yearly data frames (identical columns) into one table.
yearly_frames <- list(
  data2012, data2013, data2014, data2015, data2016,
  data2017, data2018, data2019, data2020, data2021
)
data <- do.call(rbind, yearly_frames)

Data exploration

There are 87,673 rows and 6 variables in our dataset, including datetime_beginning_utc, datetime_beginning_ept, datetime_ending_utc, datetime_ending_ept, load_area, and estimated_load_hourly.

# Inspect structure: 87,673 hourly rows x 6 columns; the datetime columns
# are still character strings at this point.
glimpse(data)
Rows: 87,673
Columns: 6
$ datetime_beginning_utc <chr> "2/24/2012 5:00:00 AM", "2/24/2012 6:…
$ datetime_beginning_ept <chr> "2/24/2012 12:00:00 AM", "2/24/2012 1…
$ datetime_ending_utc    <chr> "2/24/2012 6:00:00 AM", "2/24/2012 7:…
$ datetime_ending_ept    <chr> "2/24/2012 1:00:00 AM", "2/24/2012 2:…
$ load_area              <chr> "AEP", "AEP", "AEP", "AEP", "AEP", "A…
$ estimated_load_hourly  <int> 13640, 13245, 13065, 13063, 13255, 14…

Delete the unnecessary columns

Since we only need the beginning datetime of each hourly load record, we deleted the datetime_ending_utc and datetime_ending_ept variables. In addition, because our research focuses on data from the American Electric Power company, which covers the Eastern part of the U.S., we only need the EPT datetime, so we also deleted the datetime_beginning_utc variable.

# Keep only the beginning EPT timestamp, the load area, and the hourly load;
# the UTC and ending timestamps are redundant for this analysis.
df <- subset(data,
             select = c(datetime_beginning_ept, load_area, estimated_load_hourly))
glimpse(df)
Rows: 87,673
Columns: 3
$ datetime_beginning_ept <chr> "2/24/2012 12:00:00 AM", "2/24/2012 1…
$ load_area              <chr> "AEP", "AEP", "AEP", "AEP", "AEP", "A…
$ estimated_load_hourly  <int> 13640, 13245, 13065, 13063, 13255, 14…

Change datatype and rename the variable

Changed the datatype of the datetime_beginning_ept variable from string to datetime and renamed it to dt; also renamed estimated_load_hourly to energy_use.

# Parse the EPT timestamp string into a datetime, then shorten the column
# names: datetime_beginning_ept -> dt, estimated_load_hourly -> energy_use.
df$datetime_beginning_ept <- mdy_hms(df$datetime_beginning_ept)
names(df)[c(1, 3)] <- c("dt", "energy_use")
glimpse(df)
Rows: 87,673
Columns: 3
$ dt         <dttm> 2012-02-24 00:00:00, 2012-02-24 01:00:00, 2012-0…
$ load_area  <chr> "AEP", "AEP", "AEP", "AEP", "AEP", "AEP", "AEP", …
$ energy_use <int> 13640, 13245, 13065, 13063, 13255, 14042, 15533, …

Check if there are missing values in the data loaded

There are no missing values in the dataset.

# Report, per column, whether any value is missing (all FALSE = complete).
vapply(df, anyNA, logical(1))
        dt  load_area energy_use 
     FALSE      FALSE      FALSE 

Data visualization

Quick visualize the data to see if there is any unusual situation in the dataset.

# Quick line plot of hourly load over the full period to spot anomalies.
ggplot(df, aes(x=dt,y=energy_use))+geom_line()

Remove the outliers from the dataset.

We found an unusual energy-usage pattern at the end of 2012, so we decided to remove the data from 2012. Besides, we also noticed that the data for 2022 is incomplete, so we deleted the data from 2022 as well. After removal, there are 78,888 rows remaining in the dataset.

# Drop the anomalous 2012 data and the incomplete 2022 data in one pass.
df <- subset(df, !(year(dt) %in% c(2012, 2022)))
glimpse(df)
Rows: 78,888
Columns: 3
$ dt         <dttm> 2013-01-01 00:00:00, 2013-01-01 01:00:00, 2013-0…
$ load_area  <chr> "AEP", "AEP", "AEP", "AEP", "AEP", "AEP", "AEP", …
$ energy_use <int> 14327, 14190, 13785, 13730, 13820, 13938, 14276, …

Check the chart again

The data looks better than before. In the end, we keep the data from 2013 to 2021 and 3 variables: dt (datetime), load_area, and energy_use.

# Re-plot after removing 2012 and 2022 to confirm the anomaly is gone.
ggplot(df, aes(x=dt,y=energy_use))+geom_line()

Export new file

# Export the cleaned data. The original call used append=TRUE with
# col.names=FALSE, which wrote a file with no header row and appended a
# duplicate copy of all rows every time the script was re-run. write.csv
# writes a fresh file with a header instead.
write.csv(df,
          file = "/Users/meizhuchen/Desktop/Columbia/Columbia_S2/APAN_5205/Project/new data.csv",
          row.names = FALSE, na = "NA")

Aggregate the hourly data into daily data

# Collapse the hourly records to daily totals: truncate each timestamp to a
# calendar date, then sum energy_use within each date.
df1 <- aggregate(energy_use ~ dt,
                 data = transform(df, dt = as.Date(dt)),
                 FUN = sum)
glimpse(df1)
Rows: 3,287
Columns: 2
$ dt         <date> 2013-01-01, 2013-01-02, 2013-01-03, 2013-01-04, …
$ energy_use <int> 363696, 422097, 434357, 428915, 390983, 366413, 4…

Aggregate the daily data into monthly data

# Roll the daily totals up to monthly totals keyed by "YYYY-MM" strings.
df2 <- aggregate(energy_use ~ dt,
                 data = transform(df1, dt = format(dt, "%Y-%m")),
                 FUN = sum)
glimpse(df2)
Rows: 108
Columns: 2
$ dt         <chr> "2013-01", "2013-02", "2013-03", "2013-04", "2013…
$ energy_use <int> 12468894, 11386614, 11864582, 9933289, 10362414, …

Build up a times series dataset

#By day
# Daily series: frequency 365, starting at the (year, day-of-year) of the
# first observed date.
first_day <- min(df1$dt)
ts1 <- ts(df1$energy_use,
          start = c(lubridate::year(first_day), lubridate::yday(first_day)),
          frequency = 365)
ts_info(ts1)
 The ts1 series is a ts object with 1 variable and 3287 observations
 Frequency: 365 
 Start time: 2013 1 
 End time: 2022 2 
#By month
# Drop the "YYYY-MM" label column so ts() receives only the numeric series.
df2<-subset(df2, select = c(-dt))
# Monthly series: Jan 2013 through Dec 2021, frequency 12 (108 observations).
ts2<-ts(df2,start = c(2013,01), end = c(2021,12),frequency = 12)
class(ts2)
[1] "ts"
ts_info(ts2)
 The ts2 series is a ts object with 1 variable and 108 observations
 Frequency: 12 
 Start time: 2013 1 
 End time: 2021 12 

Add more time features into the datasets.

# Add hour feature
# Hour of day (0-23) extracted from each hourly timestamp.
df<-df %>%
  mutate(hour=hour(dt))

# Add weekdays and holiday features
# Day-of-week name plus a "UnitedStates" holiday flag for each day.
# Fix: refer to the dt column directly inside mutate() rather than reaching
# back to the global df1$dt, which would silently misalign under grouping.
df1<-df1 %>%
  mutate(weekdays=weekdays(dt),
         holiday=isHoliday("UnitedStates", as.Date(dt)))

EDA

By hour

# Total energy use summed within each hour of the day.
ggplot(df, aes(x = hour, y = energy_use, fill = hour)) +
  geom_col() +
  xlab("hour") +
  ylab("energy")

By weekdays

# Total energy use summed by day of week.
ggplot(df1, aes(x = weekdays, y = energy_use, fill = weekdays)) +
  geom_col() +
  xlab("weekdays") +
  ylab("energy")

By holidays

# Total energy use on holidays vs non-holidays.
ggplot(df1, aes(x = holiday, y = energy_use, fill = holiday)) +
  geom_col() +
  xlab("holiday or not") +
  ylab("energy")

By day

# ts1 holds daily totals, so label the plot "Daily" — the original title
# said "Hourly", which misdescribed the aggregated series.
ts_plot(ts1, title="Daily Energy Consumption - AEP")

Decompose

Heatmap

# Calendar-style heatmap of the daily series to surface seasonal patterns.
ts_heatmap(ts1)

By month

# Quantile plot of the daily data grouped by month.
ts_quantile(df1, period="monthly")

By month

By month

# Polar seasonal plot of the monthly series. Use TRUE rather than the
# reassignable shorthand T.
ggseasonplot(ts2, polar = TRUE)

Data Modeling

Split the data into train and test

Since the test data contains 24 months, we will be constructing forecasts for 24 periods.

# Hold out the final 24 months (2020-2021) as the test window and train on
# 2013-2019. Use <- for assignment per R convention.
train <- window(ts2, end = c(2019, 12))
test <- window(ts2, start = c(2020, 1))
length(test)
[1] 24

Simple Forecasting methods

Method

# Four baseline forecasts over the 24-month test horizon. Assignments use
# <- and the drift flag spells out TRUE instead of T.
average_model <- meanf(train, h = 24)          # Average method
naive_model <- naive(train, h = 24)            # Naive method
seasonal_naive_model <- snaive(train, h = 24)  # Seasonal naive method
drift_model <- rwf(train, h = 24, drift = TRUE) # Drift method

Accuracy

# Compare out-of-sample accuracy: accuracy() returns a matrix whose second
# row ([2,]) holds the test-set error measures.
rbind(average_model = accuracy(f = average_model,x = ts2)[2,],
      naive_model = accuracy(f = naive_model,x = ts2)[2,],
      seasonal_naive_model = accuracy(f = seasonal_naive_model,x = ts2)[2,],
      drift_model = accuracy(f = drift_model,x = ts2)[2,]
      )
                            ME      RMSE      MAE       MPE     MAPE
average_model        -422351.3 1067452.5 913989.7 -4.959097 9.133660
naive_model          -600494.1 1149638.0 973370.6 -6.676168 9.828396
seasonal_naive_model -276488.2  573660.7 475508.8 -2.850343 4.622409
drift_model          -389431.9 1072761.5 906427.9 -4.655948 9.053743
                          MASE       ACF1 Theil's U
average_model        1.8688875  0.4432732 1.0995476
naive_model          1.9903071  0.4432732 1.1945507
seasonal_naive_model 0.9723002 -0.3452301 0.5355287
drift_model          1.8534256  0.4688047 1.1081050

Visualization of Forecasts

# Overlay the four baseline forecasts (prediction intervals suppressed) on
# the training series, with the held-out test series for comparison.
autoplot(train)+
  autolayer(average_model,PI = F,size=1.1,series = 'Average Model')+
  autolayer(naive_model,PI=F,size=1.1, series='Naive Model')+
  autolayer(seasonal_naive_model,PI=F,size=1.1,series='Seasonal Naive Model')+
  autolayer(drift_model,PI=F,size=1.1,series='Drift Model')+
  autolayer(test)

Exponential Smoothing Models

Forecasts are weighted averages of past observations with the weights decaying exponentially such that recent observations get weighted more than distant observations.

Method

# Three exponential-smoothing forecasts over the 24-month test horizon.
# Assignments use <- and damped spells out TRUE instead of T.
ses_model <- ses(train, h = 24) # Simple exponential smoothing: weighted averages, heaviest weight on the most recent observations.
holt_model <- holt(train, h = 24) # Holt's method: extends SES to allow forecasting data with a trend.
holt_damped_model <- holt(train, h = 24, damped = TRUE) # Damped Holt: dampens the otherwise indefinitely constant trend.

Accuracy

# Test-set ([2,]) accuracy for the three exponential-smoothing models.
rbind(ses_model = accuracy(f = ses_model,x = ts2)[2,],
      holt_model = accuracy(f = holt_model,x = ts2)[2,],
      holt_damped_model = accuracy(f = holt_damped_model,x = ts2)[2,]
      )
                         ME    RMSE      MAE       MPE     MAPE
ses_model         -346500.0 1039777 890174.6 -4.227986 8.851409
holt_model        -395029.8 1070486 904443.3 -4.707237 9.036634
holt_damped_model -205592.2 1002228 865494.4 -2.870435 8.509174
                      MASE      ACF1 Theil's U
ses_model         1.820191 0.4432732  1.065192
holt_model        1.849367 0.4632745  1.105293
holt_damped_model 1.769726 0.4442958  1.013739

Visualization of Forecasts

# Overlay the exponential-smoothing forecasts and the test series. The
# original chain was missing the "+" before autolayer(test), so the test
# layer was evaluated as a stray standalone expression (its mapping was
# printed to the console) and never added to the plot.
autoplot(train)+
  autolayer(ses_model,PI = FALSE,size=1.1,series = 'ses_model')+
  autolayer(holt_model,PI=FALSE,size=1.1, series='holt_model')+
  autolayer(holt_damped_model,PI=FALSE,size=1.1,series='holt_damped_model')+
  autolayer(test)
mapping: group = ~series, colour = ~series, x = ~timeVal, y = ~seriesVal 
geom_line: na.rm = FALSE, orientation = NA
stat_identity: na.rm = FALSE
position_identity 

ETS Models

ETS models in R are handled by ets() from library(forecast). Unlike functions such as naive(), ses(), hw() functions, the ets() function does not produce forecasts. Rather, it estimates the model parameters and returns information on the fitted model.

Method

# When only the time series is supplied and all other arguments are left at
# their defaults, ets() automatically selects the best model based on AICc.
ets_auto <- ets(train)
summary(ets_auto)
ETS(M,N,M) 

Call:
 ets(y = train) 

  Smoothing parameters:
    alpha = 0.2632 
    gamma = 3e-04 

  Initial states:
    l = 11085975.5404 
    s = 1.0534 0.9544 0.9075 0.9479 1.0676 1.0758
           0.9915 0.9199 0.8809 1.0238 1.0155 1.1619

  sigma:  0.0422

     AIC     AICc      BIC 
2576.918 2583.976 2613.380 

Training set error measures:
                    ME     RMSE      MAE        MPE     MAPE     MASE
Training set -14904.15 430291.9 340718.1 -0.2595381 3.110457 0.696686
                  ACF1
Training set 0.1788073

Examine the residuals.

The Ljung-Box test on the residuals has a p-value far below 0.05, so the residuals of the selected ETS model still contain some autocorrelation.

# Residual diagnostics (time plot, ACF, Ljung-Box test) for the fitted ETS model.
checkresiduals(ets_auto)


    Ljung-Box test

data:  Residuals from ETS(M,N,M)
Q* = 25.35, df = 3, p-value = 1.305e-05

Model df: 14.   Total lags used: 17

Accuracy

# Forecast 24 months ahead with the auto-selected ETS model and score it
# against the full series (the test rows form the out-of-sample portion).
ets_auto_forecast <- forecast(ets_auto, h = 24)
accuracy(ets_auto_forecast, x = ts2)
                     ME     RMSE      MAE        MPE     MAPE
Training set  -14904.15 430291.9 340718.1 -0.2595381 3.110457
Test set     -306414.16 542260.1 437906.4 -3.1575038 4.275699
                  MASE      ACF1 Theil's U
Training set 0.6966860 0.1788073        NA
Test set     0.8954126 0.1208285 0.5027521

Visualization of Forecasts

# Plot the ETS forecast against the test data. Label the layer with the
# model actually selected, ETS(M,N,M) — the original label said "MAM".
autoplot(train)+
  autolayer(ets_auto_forecast,series="ETS - MNM (auto)",PI=FALSE)+
  autolayer(test)

ARIMA

ARIMA models are among the most widely used approaches to time-series forecasting and provide a complementary approach to exponential smoothing. ARIMA models aim to describe the autocorrelations in the data.

Check if it’s stationary or not

# Augmented Dickey-Fuller test with no lagged differences (k = 0); a small
# p-value rejects the unit-root null, i.e. the series tests as stationary.
adf.test(ts2,k = 0)

    Augmented Dickey-Fuller Test

data:  ts2
Dickey-Fuller = -6.9447, Lag order = 0, p-value = 0.01
alternative hypothesis: stationary

Automatic Model Selection

Use auto.arima to pick the best model based on AICc, sometimes it does not yield an optimal solution as it uses computational shortcuts, by setting stepwise and approximation to False, we will ensure a more extensive search.

# Exhaustive ARIMA search (stepwise and approximation shortcuts disabled)
# with one regular and one seasonal difference. Assignment uses <- and
# FALSE is spelled out instead of F.
model_auto <- auto.arima(y = train, d = 1, D = 1,
                         stepwise = FALSE, approximation = FALSE)
model_auto
Series: train 
ARIMA(1,1,1)(0,1,1)[12] 

Coefficients:
         ar1      ma1     sma1
      0.3659  -0.9522  -0.7198
s.e.  0.1358   0.0931   0.1870

sigma^2 = 2.495e+11:  log likelihood = -1036.44
AIC=2080.88   AICc=2081.49   BIC=2089.93

Model

# Refit the selected ARIMA(1,1,1)(0,1,1)[12] specification with a Box-Cox
# transformation, then inspect the residual diagnostics.
model1 <- Arima(y = train, order = c(1, 1, 1), seasonal = c(0, 1, 1),
                lambda = BoxCox.lambda(train))
ggtsdisplay(residuals(model1))

Accuracy

# 24-month ARIMA forecast scored against the full series.
model1_forecast <- forecast(model1, h = 24)
accuracy(model1_forecast, x = ts2)
                     ME     RMSE      MAE        MPE     MAPE
Training set  -14698.14 448186.7 333727.5 -0.2259307 3.062610
Test set     -245988.78 515513.3 432957.8 -2.5971909 4.229228
                  MASE         ACF1 Theil's U
Training set 0.6823920  0.005927225        NA
Test set     0.8852938 -0.034303574 0.4908497

Visualization of Forecasts

# ARIMA forecast (point forecasts only) overlaid with the test series.
autoplot(forecast(model1,h = 24),PI=F)+
  autolayer(test,size=1)

Comparing Forecasting Models

Compare all the models in terms of accuracy metrics on the test sample.

# Test-set accuracy ([2,]) for every fitted model; lower RMSE/MAPE/MASE is
# better. The printed results show ARIMA and auto-ETS outperform all the
# simple baselines and smoothing models.
rbind(average_model = accuracy(f = average_model,x = ts2)[2,],
      naive_model = accuracy(f = naive_model,x = ts2)[2,],
      seasonal_naive_model = accuracy(f = seasonal_naive_model,x = ts2)[2,],
      drift_model = accuracy(f = drift_model,x = ts2)[2,],
      ses_model = accuracy(f = ses_model,x =  ts2)[2,],
      holt_model = accuracy(f = holt_model,x = ts2)[2,],
      holt_damped_model = accuracy(f = holt_damped_model,x =  ts2)[2,],
      ets_auto = accuracy(ets_auto_forecast,x =  ts2)[2,],
      arima = accuracy(model1_forecast,x= ts2)[2,]
      )
                            ME      RMSE      MAE       MPE     MAPE
average_model        -422351.3 1067452.5 913989.7 -4.959097 9.133660
naive_model          -600494.1 1149638.0 973370.6 -6.676168 9.828396
seasonal_naive_model -276488.2  573660.7 475508.8 -2.850343 4.622409
drift_model          -389431.9 1072761.5 906427.9 -4.655948 9.053743
ses_model            -346500.0 1039777.2 890174.6 -4.227986 8.851409
holt_model           -395029.8 1070485.6 904443.3 -4.707237 9.036634
holt_damped_model    -205592.2 1002228.2 865494.4 -2.870435 8.509174
ets_auto             -306414.2  542260.1 437906.4 -3.157504 4.275699
arima                -245988.8  515513.3 432957.8 -2.597191 4.229228
                          MASE        ACF1 Theil's U
average_model        1.8688875  0.44327322 1.0995476
naive_model          1.9903071  0.44327322 1.1945507
seasonal_naive_model 0.9723002 -0.34523015 0.5355287
drift_model          1.8534256  0.46880473 1.1081050
ses_model            1.8201914  0.44327322 1.0651918
holt_model           1.8493674  0.46327447 1.1052930
holt_damped_model    1.7697264  0.44429585 1.0137387
ets_auto             0.8954126  0.12082848 0.5027521
arima                0.8852938 -0.03430357 0.4908497
# Plot all model forecasts against the train/test series. The original
# legend mislabeled the drift and SES layers as 'Seasonal Naive Model'
# (copy-paste error); each layer now carries its own series name, and
# FALSE is spelled out instead of F.
autoplot(train, color='sienna')+
  autolayer(test,size=1.05,color='seagreen2')+
  autolayer(average_model,series = 'Average Model',PI=FALSE)+
  autolayer(naive_model,series = 'Naive Model',PI=FALSE)+
  autolayer(seasonal_naive_model,series = 'Seasonal Naive Model',PI=FALSE)+
  autolayer(drift_model,series = 'Drift Model',PI=FALSE)+
  autolayer(ses_model,series = 'SES Model',PI=FALSE)+
  autolayer(holt_model,series = 'Holt',PI=FALSE)+
  autolayer(ets_auto_forecast,series = 'ETS Auto',PI=FALSE)+
  autolayer(model1_forecast,series = 'ARIMA',PI=FALSE)

Forecast for the next 12 months.

# Refit the chosen ARIMA specification on the full 2013-2021 series and
# forecast the 12 months of 2022.
model2 <- Arima(y = ts2, order = c(1, 1, 1), seasonal = c(0, 1, 1),
                lambda = BoxCox.lambda(ts2))
forecast(model2, h = 12)
         Point Forecast    Lo 80    Hi 80    Lo 95    Hi 95
Jan 2022       11901306 11294253 12486455 10963026 12788203
Feb 2022       10577829  9870137 11252377  9480051 11597697
Mar 2022       10493936  9775360 11178104  9378866 11528103
Apr 2022        9035803  8225966  9795567  7772724 10180713
May 2022        9549752  8774258 10281569  8342722 10653869
Jun 2022       10357495  9629766 11049510  9227752 11403243
Jul 2022       11532923 10863426 12175056 10496495 12505105
Aug 2022       11362184 10684911 12011062 10313348 12344340
Sep 2022        9917097  9164110 10630296  8746580 10993952
Oct 2022        9421228  8637142 10160112  8200229 10535684
Nov 2022        9970138  9220106 10680886  8804408 11043402
Dec 2022       11044554 10351747 11706856  9970899 12046546